## This program takes the mwu file and converts reads and ratio into a fitness value
## Requires one argument: mwu
## > python mwu_to_cluster.py mwu_sample.txt fdr_sample.txt > clusteroutput.txt
## 

import operator, random, sys

## Create Dictionary with genes and P-vals {gene:[p-val],...}

gene_dict = {}

#find total reads in ctrl and exp
readcountctrl = 0
readcountexp = 0
for line in open(sys.argv[1]):
    split = line.split('\t')
    #print split
    if split[0][0:3]=='SAO':
        try:
            readcountctrl = readcountctrl + int(split[2])
            readcountexp = readcountexp + int(split[3])
        except Exception:
            pass


mincountctrl = float(readcountctrl)/10000
mincountexp = float(readcountexp)/10000

#put important info in a dictionary: 0-#TA sites, 1-length, 2-readsctrl,3-readsexp, 4-rawratio, 5-modratio, 6-index, 7-rawfitscore
for line in open(sys.argv[1]):
    split = line.split('\t')
    if split[0][0:3]=='SAO' and int(split[1])>=0:
        try:
            readsctrl=int(split[2])
        except Exception:
            pass
        if readsctrl < mincountctrl: readsctrl = mincountctrl
        try:
            readsexp = int(split[3])
        except Exception:
            pass
        if readsexp <mincountexp: readsexp = mincountexp
        ratio = float(readsexp)/readsctrl
        try:
            TAsites = int(split[1])
            length = int(split[7])
            rawratio = float(split[6])
            #print split[6]
            index = float(readsexp)/length
            rawfit = ratio*index
            gene_dict[split[0]] = [TAsites,length,int(split[2]),int(split[3]),rawratio,ratio,index,rawfit]
        except Exception:
            pass
        # gene_dict = readsctrl, readsexp, ratioexp, indexexp, unnormalized fitness value
        #index = reads/length of gene
#print gene_dict

#Add corrected pval from fdr file into dictionary
#0-#TA sites, 1-length, 2-readsctrl,3-readsexp, 4-rawratio, 5-modratio, 6-index, 7-rawfitscore, 8-corrpval
for line in open(sys.argv[2]):
    split = line.split('\t')
    if split[0] in gene_dict:
        gene_dict[split[0]].append(float(split[1].rstrip()))

#print gene_dict

#Make fitvals a list instead of a dict for calculating norm-fit value
fitvals = []
for k,v in gene_dict.iteritems():
    fitvals.append([v[7],k])

fitvals.sort()
count=0
for f in fitvals:
    f.append(count)
    count+=1
    normorder = float(count)/len(fitvals)
    f.append(normorder)
#print fitvals

#put normalized fitval into genedict
#0-#TA sites, 1-length, 2-readsctrl,3-readsexp, 4-rawratio, 5-modratio, 6-index, 7-rawfitscore, 8-corrpval, 9-normfitval
for f in fitvals:
    gene = f[1]
    if gene in gene_dict:
        gene_dict[gene].append(f[3])
#print gene_dict

genes = gene_dict.keys()
genes.sort()

for g in genes:
    #print len(gene_dict[g])
    if len(gene_dict[g]) > 6:
        print "%s,%d,%d,%d,%d,%0.10f,%0.100f,%0.10f,%0.10f,%0.6f" % (g,gene_dict[g][0],gene_dict[g][1],gene_dict[g][2],gene_dict[g][3],gene_dict[g][4],gene_dict[g][8],gene_dict[g][5],gene_dict[g][6],gene_dict[g][9])

        #genename,#TAsites,length,readsctrl,readsexp,rawratio,corrpval,modratio,index,normfitscore
        












